\section{Gradient of Convex Conjugate Functional}

Here is our problem. Given an arbitrary convex function $f(\textbf{x})$, we want to derive the gradient of the following functional:

\begin{equation}\nonumber
g(\textbf{y}) = -f^{\ast}(-\textbf{A}^T\textbf{y}) - \textbf{b}^T\textbf{y}
\end{equation}

Here $f^{\ast}(\cdot)$ is the convex conjugate of $f(\cdot)$, which is defined as follows:

\begin{equation}\nonumber
f^{\ast}(\textbf{y}) = \sup_{\textbf{x}}\{\textbf{y}^T\textbf{x}-f(\textbf{x})\}
\end{equation}

Notice that such a functional is actually an element of the dual space of the vector space of convex functions. By substituting the definition of the convex conjugate into our problem, we get

\begin{equation}\nonumber
\begin{split}
g(\textbf{y}) &= -\sup_{\textbf{x}}\{-\textbf{y}^T\textbf{Ax}-f(\textbf{x})\} - \textbf{b}^T\textbf{y}\\
&= \inf_{\textbf{x}}\{\textbf{y}^T\textbf{Ax}+f(\textbf{x})\} - \textbf{b}^T\textbf{y}\\
&= p(\textbf{y}) - \textbf{b}^T\textbf{y}
\end{split}
\end{equation}

Lacking a general rule for differentiating functionals such as $\displaystyle\sup_{\textbf{x}}(\cdot)$, I derive the result directly from the definition of the derivative. The key fact, known as the envelope (or Danskin) theorem, is that to first order the minimizer $\textbf{x}^{\ast}$ may be treated as fixed when differentiating the infimum with respect to $\textbf{y}$.

\begin{equation}\nonumber
\begin{split}
\nabla g(\textbf{y}) &= \frac{\partial g(\textbf{y})}{\partial \textbf{y}} = \frac{\partial p(\textbf{y})}{\partial \textbf{y}} - \textbf{b} = \begin{pmatrix} \frac{\partial p}{\partial y_{1}} \\  \frac{\partial p}{\partial y_{2}} \\ \vdots \\ \frac{\partial p}{\partial y_{n}} \end{pmatrix} - \textbf{b} \\
\frac{\partial p}{\partial y_{i}} &= \lim_{\Delta y \rightarrow 0}\frac{\displaystyle\inf_{\textbf{x}}\{(\textbf{y}+\boldsymbol{\epsilon})^T\textbf{Ax}+f(\textbf{x})\} - \displaystyle\inf_{\textbf{x}}\{\textbf{y}^T\textbf{Ax}+f(\textbf{x})\}}{\Delta y}, \quad \boldsymbol{\epsilon} = \Delta y\,\textbf{e}_{i}, \quad \textbf{e}_{i} = \begin{pmatrix} 0 \\ \vdots \\ 1 \\ \vdots \\ 0 \end{pmatrix}\\
&= \lim_{\Delta y \rightarrow 0}\frac{\{(\textbf{y}+\boldsymbol{\epsilon})^T\textbf{A}\textbf{x}^{\ast}+f(\textbf{x}^{\ast})\} - \{\textbf{y}^T\textbf{A}\textbf{x}^{\ast}+f(\textbf{x}^{\ast})\}}{\Delta y}, \quad \textbf{x}^{\ast} = \operatorname*{arg\,min}_{\textbf{x}}\{\textbf{y}^T\textbf{Ax}+f(\textbf{x})\}\\
&= \lim_{\Delta y \rightarrow 0}\frac{\boldsymbol{\epsilon}^T\textbf{A}\textbf{x}^{\ast}}{\Delta y} = \textbf{e}_{i}^{T}\textbf{A}\textbf{x}^{\ast} = (\textbf{A}\textbf{x}^{\ast})_{i}\\
&\Rightarrow \frac{\partial p}{\partial \textbf{y}} = \textbf{A}\textbf{x}^{\ast}\\
&\Rightarrow \nabla g(\textbf{y}) = \textbf{A}\textbf{x}^{\ast} - \textbf{b}
\end{split}
\end{equation}

